{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GBDT+LR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(569, 30)"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.datasets import load_breast_cancer\n",
    "\n",
    "# return_X_y=True yields a (features, labels) pair instead of a Bunch object.\n",
    "breast_cancer = load_breast_cancer(return_X_y=True)\n",
    "X, y = breast_cancer\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Hold out 20% of the samples for testing; the fixed seed keeps the split\n",
    "# identical across runs.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,\n",
       "                           learning_rate=0.1, loss='deviance', max_depth=2,\n",
       "                           max_features=None, max_leaf_nodes=None,\n",
       "                           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                           min_samples_leaf=1, min_samples_split=2,\n",
       "                           min_weight_fraction_leaf=0.0, n_estimators=10,\n",
       "                           n_iter_no_change=None, presort='deprecated',\n",
       "                           random_state=None, subsample=1.0, tol=0.0001,\n",
       "                           validation_fraction=0.1, verbose=0,\n",
       "                           warm_start=False)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "# A small GBDT (10 trees of depth 2). Its leaf indices are reused further\n",
    "# down as categorical features for logistic regression, so the seed is fixed\n",
    "# to make the fitted trees (and the derived features) reproducible.\n",
    "mm = GradientBoostingClassifier(n_estimators=10,\n",
    "                                max_depth=2,\n",
    "                                random_state=42)\n",
    "mm.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GBDT AUC: 0.96856\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import roc_curve, auc, roc_auc_score\n",
    "\n",
    "# Column 1 of predict_proba is the probability of the positive class.\n",
    "y_pred = mm.predict_proba(X_test)[:, 1]\n",
    "gbdt_auc = roc_auc_score(y_test, y_pred)\n",
    "\n",
    "print('GBDT AUC: %.5f' % gbdt_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mm.apply() gives, for every sample, the index of the leaf it lands in for\n",
    "# each tree; index 0 on the last axis selects the single output of the\n",
    "# binary model, leaving a 2-D (samples, trees) array.\n",
    "X_train_leaves = mm.apply(X_train)[:, :, 0]\n",
    "X_test_leaves = mm.apply(X_test)[:, :, 0]\n",
    "\n",
    "# Number of training rows, used later to split the stacked matrix again.\n",
    "train_rows, cols = X_train_leaves.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "# Learn the leaf vocabulary from the training rows only, so that nothing\n",
    "# about the test set influences the encoding. handle_unknown='ignore' makes\n",
    "# a leaf index that was not seen during fit encode as all zeros instead of\n",
    "# raising an error.\n",
    "enc = OneHotEncoder(categories='auto', handle_unknown='ignore')\n",
    "enc.fit(X_train_leaves)\n",
    "\n",
    "# Keep the stacked layout (training rows first, test rows after) so the\n",
    "# matrix can still be split at train_rows.\n",
    "X_trans = enc.transform(\n",
    "    np.concatenate((X_train_leaves, X_test_leaves), axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GBDT+LR AUC: 0.97019\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# X_trans holds the training rows first and the test rows after them.\n",
    "X_trans_train = X_trans[:train_rows, :]\n",
    "X_trans_test = X_trans[train_rows:, :]\n",
    "\n",
    "lr = LogisticRegression(solver='lbfgs')\n",
    "lr.fit(X_trans_train, y_train)\n",
    "\n",
    "# Column 1 of predict_proba is the probability of the positive class.\n",
    "y_pred_gbdtlr = lr.predict_proba(X_trans_test)[:, 1]\n",
    "gbdtlr_auc = roc_auc_score(y_test, y_pred_gbdtlr)\n",
    "\n",
    "print('GBDT+LR AUC: %.5f' % gbdtlr_auc)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Super Learner"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 自定义实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "%run superLearnerexample.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.18912039, -1.33031363,  0.92165011])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Test dataset: a synthetic regression problem\n",
    "from sklearn.datasets import make_regression\n",
    "\n",
    "random_state = 42\n",
    "dataset_options = dict(n_samples=1000, n_features=3, noise=1)\n",
    "X, y = make_regression(random_state=random_state, **dataset_options)\n",
    "\n",
    "# Show the first sample as a quick sanity check.\n",
    "X[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Super Learner: MSE 0.978\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.ensemble import BaggingRegressor\n",
    "\n",
    "# Base models: a linear model, a decision tree and a bagging ensemble.\n",
    "# The stochastic ones share the notebook seed.\n",
    "linear_model = LinearRegression()\n",
    "tree_model = DecisionTreeRegressor(random_state=random_state)\n",
    "bagging_model = BaggingRegressor(random_state=random_state)\n",
    "base_models = [linear_model, tree_model, bagging_model]\n",
    "\n",
    "# Meta model that combines the base models' predictions.\n",
    "meta_model = LinearRegression()\n",
    "\n",
    "# NOTE(review): SuperLearnerExample is presumably defined by the %run of\n",
    "# superLearnerexample.py in an earlier cell -- confirm.\n",
    "spl = SuperLearnerExample(\n",
    "    X, y, base_models, meta_model, random_state=random_state)\n",
    "spl.fit()\n",
    "spl.evaluate_meta_models()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([9.97922299e-01, 2.42673881e-04, 2.14871114e-03])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spl.meta_model.coef_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用SuperLearner库实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[MLENS] backend: threading\n"
     ]
    }
   ],
   "source": [
    "from mlens.ensemble import SuperLearner"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bulid_super_learner(X,\n",
    "                        y,\n",
    "                        base_models,\n",
    "                        meta_model,\n",
    "                        score_fun,\n",
    "                        kfolds=3,\n",
    "                        test_size=0.3,\n",
    "                        random_state=None):\n",
    "    \"\"\"Fit an mlens SuperLearner and report its MSE on a held-out split.\n",
    "\n",
    "    The data is split once into a fitting part and a validation part. The\n",
    "    ensemble is fitted on the former and its mean squared error on the\n",
    "    latter is printed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X, y : array-like\n",
    "        Features and targets; split with train_test_split.\n",
    "    base_models : list\n",
    "        Estimators added as the first layer of the ensemble.\n",
    "    meta_model : estimator\n",
    "        Estimator added as the meta layer.\n",
    "    score_fun : callable\n",
    "        Passed to SuperLearner as its scorer.\n",
    "    kfolds : int, default 3\n",
    "        Passed to SuperLearner as its number of folds.\n",
    "    test_size : float, default 0.3\n",
    "        Fraction of the data held out for validation.\n",
    "    random_state : int or None, default None\n",
    "        Seed used for both the split and the ensemble.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    SuperLearner\n",
    "        The fitted ensemble.\n",
    "    \"\"\"\n",
    "    # Imported locally so the function does not depend on a name that only\n",
    "    # exists if some other cell or script left it in the kernel namespace.\n",
    "    from sklearn.metrics import mean_squared_error\n",
    "\n",
    "    # Distinct names keep the caller's full X / y from being shadowed.\n",
    "    X_fit, X_val, y_fit, y_val = train_test_split(X,\n",
    "                                                  y,\n",
    "                                                  test_size=test_size,\n",
    "                                                  random_state=random_state)\n",
    "\n",
    "    ensemble = SuperLearner(scorer=score_fun,\n",
    "                            folds=kfolds,\n",
    "                            shuffle=True,\n",
    "                            sample_size=len(X_fit),\n",
    "                            random_state=random_state)\n",
    "    ensemble.add(base_models)\n",
    "    ensemble.add_meta(meta_model)\n",
    "    ensemble.fit(X_fit, y_fit)\n",
    "\n",
    "    # The score is computed on the held-out rows, so label it as validation\n",
    "    # error rather than training error.\n",
    "    val_mse = mean_squared_error(y_val, ensemble.predict(X_val))\n",
    "    print('Super Learner Validation MSE {:.3f}'.format(val_mse))\n",
    "    return ensemble"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Super Learner Train MSE 0.975\n"
     ]
    }
   ],
   "source": [
    "from sklearn.datasets import make_regression\n",
    "from sklearn.ensemble import BaggingRegressor\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "# Set the seed before building the models that consume it, so this cell does\n",
    "# not rely on a value left over from an earlier cell.\n",
    "random_state = 42\n",
    "\n",
    "base_models = [\n",
    "    LinearRegression(),\n",
    "    DecisionTreeRegressor(random_state=random_state),\n",
    "    BaggingRegressor(random_state=random_state)\n",
    "]\n",
    "meta_model = LinearRegression()\n",
    "\n",
    "X, y = make_regression(n_samples=1000,\n",
    "                       n_features=3,\n",
    "                       noise=1,\n",
    "                       random_state=random_state)\n",
    "\n",
    "# mean_squared_error is handed over as the scorer for the ensemble.\n",
    "ensemble = bulid_super_learner(X,\n",
    "                               y,\n",
    "                               base_models,\n",
    "                               meta_model,\n",
    "                               mean_squared_error,\n",
    "                               random_state=random_state)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                  score-m  score-s  ft-m  ft-s  pt-m  pt-s\n",
      "layer-1  baggingregressor          680.29   102.88  0.06  0.00  0.00  0.00\n",
      "layer-1  decisiontreeregressor    1262.92   279.05  0.01  0.00  0.00  0.00\n",
      "layer-1  linearregression            1.00     0.02  0.00  0.00  0.00  0.00\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(ensemble.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ensemble.layers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,\n",
       "    name='layer-1', propagate_features=None, raise_on_exception=True,\n",
       "    random_state=7270, shuffle=True,\n",
       "    stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,\n",
       "    indexer=FoldIndex(X=None, folds=3, raise_on_exception=True),\n",
       "    learners=[Learner(attr='predict', backend='threading', dtype=<class 'numpy.float32'>,\n",
       "     estimator=BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_feat...ed_error at 0x1217e3d90>)],\n",
       "    n_jobs=-1, name='group-0', raise_on_exception=True, transformers=[])],\n",
       "    verbose=0),\n",
       " Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,\n",
       "    name='layer-2', propagate_features=None, raise_on_exception=True,\n",
       "    random_state=7270, shuffle=True,\n",
       "    stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,\n",
       "    indexer=FullIndex(X=None),\n",
       "    learners=[Learner(attr='predict', backend='threading', dtype=<class 'numpy.float32'>,\n",
       "     estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),\n",
       "     indexer=FullIndex...ed_error at 0x1217e3d90>)],\n",
       "    n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],\n",
       "    verbose=0)]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ensemble.layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SuperLearner(array_check=None, backend=None, folds=3,\n",
       "       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,\n",
       "   name='layer-1', propagate_features=None, raise_on_exception=True,\n",
       "   random_state=7270, shuffle=True,\n",
       "   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,\n",
       "   indexer=FoldIndex(X=None, folds=3, raise_on_exc...17e3d90>)],\n",
       "   n_jobs=-1, name='group-0', raise_on_exception=True, transformers=[])],\n",
       "   verbose=0)],\n",
       "       model_selection=False, n_jobs=None, raise_on_exception=True,\n",
       "       random_state=42, sample_size=700,\n",
       "       scorer=<function mean_squared_error at 0x1217e3d90>, shuffle=True,\n",
       "       verbose=False)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ensemble.remove(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,\n",
       "    name='layer-1', propagate_features=None, raise_on_exception=True,\n",
       "    random_state=7270, shuffle=True,\n",
       "    stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,\n",
       "    indexer=FoldIndex(X=None, folds=3, raise_on_exception=True),\n",
       "    learners=[Learner(attr='predict', backend='threading', dtype=<class 'numpy.float32'>,\n",
       "     estimator=BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_feat...ed_error at 0x1217e3d90>)],\n",
       "    n_jobs=-1, name='group-0', raise_on_exception=True, transformers=[])],\n",
       "    verbose=0)]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ensemble.layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
